library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(boot)
library(ggplot2)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ✔ purrr 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lmboot)
## Warning: package 'lmboot' was built under R version 4.2.3
library(lattice)
##
## Attaching package: 'lattice'
##
## The following object is masked from 'package:boot':
##
## melanoma
library(caret)
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(naniar)
library(utils)
library(stats)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.2.3
## corrplot 0.92 loaded
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.3
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:boot':
##
## logit
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.2.3
##
## Attaching package: 'olsrr'
##
## The following object is masked from 'package:datasets':
##
## rivers
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(GGally)
life = read.csv('https://github.com/athibeaux/MSDS-DS6372/raw/main/Life_Expectancy.csv')
ggplot(data = life) + geom_point(mapping = aes(x = GDP, y = Life.expectancy))
## Warning: Removed 453 rows containing missing values (`geom_point()`).
Upon looking at the graph of the original data set, it appears that
there needs to be a log transformation on the X or the GDP as we are
interested in seeing the relation between Life Expenctancy and GDP.
str(life)
## 'data.frame': 2938 obs. of 22 variables:
## $ Country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Year : int 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 ...
## $ Status : chr "Developing" "Developing" "Developing" "Developing" ...
## $ Life.expectancy : num 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult.Mortality : int 263 271 268 272 275 279 281 287 295 295 ...
## $ infant.deaths : int 62 64 66 69 71 74 77 80 82 84 ...
## $ Alcohol : num 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage.expenditure : num 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis.B : int 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : int 1154 492 430 2787 3013 1989 2861 1599 1141 1990 ...
## $ BMI : num 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under.five.deaths : int 83 86 89 93 97 102 106 110 113 116 ...
## $ Polio : int 6 58 62 67 68 66 63 64 63 58 ...
## $ Total.expenditure : num 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : int 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV.AIDS : num 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num 584.3 612.7 631.7 670 63.5 ...
## $ Population : num 33736494 327582 31731688 3696958 2978599 ...
## $ thinness..1.19.years : num 17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
## $ thinness.5.9.years : num 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ Income.composition.of.resources: num 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
vis_miss(life)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `gather()` instead.
## ℹ The deprecated feature was likely used in the visdat package.
## Please report the issue at <]8;;https://github.com/ropensci/visdat/issueshttps://github.com/ropensci/visdat/issues]8;;>.
dim(life)
## [1] 2938 22
#View(life)
#sum(is.na(life))
#GDP 15% [17] keep GDP to have it Imputed even if quite high percentage, assuming it is crucial to predicting Life.expectancy as richer countries have better health access/Medicine and tech. The numbers appear to be GDP per capita which helps as it addresses GDP/Population. GDP per Capita and Population would be too closely related and prob attribute to covariance.
#Adjusting text angle to vis_miss
imputeMedian= preProcess(life[,-c(1:4,9)],method="medianImpute") #predictors 1:4, 9 and response is 4
cleandataMedian = predict(imputeMedian,newdata=life)
dim(cleandataMedian)
## [1] 2938 22
vis_miss(cleandataMedian) + theme(axis.text.x = element_text(angle = 90, hjust = 0))
#Literature says that over 10% missing data can contribute to bias
#HepatitsB [9] at 19% , Population 22% [18].
#Removing columns 9 and 18
cleandataMedian = cleandataMedian[,-c(18,9)]
vis_miss(cleandataMedian) + theme(axis.text.x = element_text(angle = 90, hjust = 0))
#removing last NA
cleandataMedian = na.omit(cleandataMedian)
vis_miss(cleandataMedian) + theme(axis.text.x = element_text(angle = 90, hjust = 0))
Creating Variable of Logged GDP Attribute
ggplot(data = cleandataMedian) + geom_point(mapping = aes(x = GDP, y = Life.expectancy))
#Converting GDP to Log
cleandataMedian$logGDP = log(cleandataMedian$GDP)
#converting Life.expectancy to log
cleandataMedian$logLife.expectancy = log(cleandataMedian$Life.expectancy)
#Log transformation on GDP
ggplot(data = cleandataMedian) + geom_point(mapping = aes(x = logGDP, y = logLife.expectancy))
Splitting the data
set.seed(1234)
trainIndex<-createDataPartition(cleandataMedian$Life.expectancy,p=.8,list=F) #p: proportion of data in train
training<-cleandataMedian[trainIndex,]
validate<-cleandataMedian[-trainIndex,]
Before and after log transforming GDP, with cleandataMedian:
ggplot(data = training) + geom_point(mapping = aes(x = GDP, y = Life.expectancy))
#Log transformation on GDP
ggplot(data = training) + geom_point(mapping = aes(x = log(GDP), y = Life.expectancy))
# EDA
#Creating the World
#library(ggplot2)
#library(tidyverse)
#library(ggthemes)
#world_map = map_data("world") %>% filter(! long > 180)
#countries = world_map %>% distinct(region) %>% rowid_to_column()
#countries %>% ggplot(aes(fill = rowid, map_id = region)) + geom_map(map = world_map) + expand_limits(x = world_map$long, y = world_map$lat) + coord_map("moll") +theme_map()
library(ggplot2)
library(tidyverse)
#rename training for the map
dataforcolmap = training
#but first renaming column Country in dataforcolmap
colnames(dataforcolmap)[1] = "region"
#Renaming United States of America and Boliva (Republic...) with USA and Boliva and other countries as follows
dataforcolmap$region[dataforcolmap$region == "United States of America"] = "USA"
dataforcolmap$region[dataforcolmap$region == "Bolivia (Plurinational State of)"] = "Bolivia"
dataforcolmap$region[dataforcolmap$region == "Venezuela (Bolivarian Republic of)"] = "Venezuela"
dataforcolmap$region[dataforcolmap$region == "Republic of Korea"] = "South Korea"
dataforcolmap$region[dataforcolmap$region == "The former Yugoslav republic of Macedonia"] = "North Macedonia"
dataforcolmap$region[dataforcolmap$region == "Republic of Moldova"] = "Moldova"
dataforcolmap$region[dataforcolmap$region == "Russian Federation"] = "Russia"
dataforcolmap$region[dataforcolmap$region == "Micronesia (Federated States of)"] = "Micronesia"
dataforcolmap$region[dataforcolmap$region == "Lao People's Democratic Republic"] = "Laos"
dataforcolmap$region[dataforcolmap$region == "Iran (Islamic Republic of)"] = "Iran"
dataforcolmap$region[dataforcolmap$region == "Democratic People's Republic of Korea"] = "North Korea"
view(dataforcolmap)
#getting map data for plotting
mapdata = map_data("world")
#view(mapdata)
#joining map data with dataforcolmap
mapdata = left_join(mapdata,dataforcolmap, by = "region")
#view(mapdata)
#filtering out NAs for life expectancy , status, Income.composition.of.resources
#Life Exp
mapdata1 = mapdata %>% filter(!is.na(mapdata$Life.expectancy))
#Status
mapdata2 = mapdata %>% filter(!is.na(mapdata$Status))
#Income
mapdata3 = mapdata %>% filter(!is.na(mapdata$Income.composition.of.resources))
#mapping mapdata1 for Life Exp
map1 = ggplot(mapdata1, aes(x = long, y = lat, group = group)) + geom_polygon(aes(fill = Life.expectancy), color = "black")+ theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank(), axis.title.y = element_blank(), axis.title.x = element_blank()) + ggtitle("Life Expectancy per Country") + scale_fill_gradient(low = "red", high = "yellow") + guides(fill=guide_legend(title="Life Expectancy"))
map1
#mapping mapdata2 for Status
mapStatus = ggplot(mapdata2, aes(x = long, y = lat, group = group)) + geom_polygon(aes(fill = Status, col = "orange"), color = "black") + theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank(), axis.title.y = element_blank(), axis.title.x = element_blank()) + ggtitle("Country's Status: Developed v. Developing")
mapStatus
#mapping mapdata3 for Income Composition of Resources
mapIncome = ggplot(mapdata3, aes(x = long, y = lat, group = group)) + geom_polygon(aes(fill = Income.composition.of.resources), color = "black")+ theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank(), axis.title.y = element_blank(), axis.title.x = element_blank()) + ggtitle("Income Composition of Resources per Country") + scale_fill_gradient(low = "red", high = "yellow")+ guides(fill=guide_legend(title="Income Composition of Resources"))
mapIncome
cor <- cor(training[,c(4,5:20)])
corrplot(cor, method = "square", tl.srt = 50, tl.col = "black", tl.cex = 0.6, title = "Correlation of Variables", mar=c(0,0,1,0))
GGPairs:
#commented out for knitting
#ggpairs(training[,4:20])
# Libraries Used: ISLR, data.table, GGally, ggplot2
ggpairs(training[,4:8], lower = list(continuous = wrap("points", color = "red", alpha = 0.5), combo = wrap("box", color = "orange", alpha = 0.3), discrete = wrap("facetbar", color = "yellow", alpha = 0.3) ),diag = list(continuous = wrap("densityDiag", color = "blue", alpha = 0.5)))
ggpairs(training[,c(4,9:12)], lower = list(continuous = wrap("points", color = "red", alpha = 0.5), combo = wrap("box", color = "orange", alpha = 0.3), discrete = wrap("facetbar", color = "yellow", alpha = 0.3) ),diag = list(continuous = wrap("densityDiag", color = "blue", alpha = 0.5)))
ggpairs(training[,c(4,13:16)], lower = list(continuous = wrap("points", color = "red", alpha = 0.5), combo = wrap("box", color = "orange", alpha = 0.3), discrete = wrap("facetbar", color = "yellow", alpha = 0.3) ),diag = list(continuous = wrap("densityDiag", color = "blue", alpha = 0.5)))
ggpairs(training[,c(4,17:20)], lower = list(continuous = wrap("points", color = "red", alpha = 0.5), combo = wrap("box", color = "orange", alpha = 0.3), discrete = wrap("facetbar", color = "yellow", alpha = 0.3) ),diag = list(continuous = wrap("densityDiag", color = "blue", alpha = 0.5)))
#ggpairs(cleandataMedian[,5:22], upper = list(continuous = wrap("cor", size = 4.75, align_percent = 1)))
#ggscatmat(cleandataMedian, columns = 4:10)
Full Model
KitchenSink <- lm(Life.expectancy~Year + Status + Adult.Mortality +
infant.deaths + Alcohol + percentage.expenditure +
Measles + BMI + under.five.deaths + Polio +
Total.expenditure + Diphtheria + HIV.AIDS + log(GDP) +
thinness..1.19.years + thinness.5.9.years +
Income.composition.of.resources + Schooling, training)
plot(KitchenSink)
vif(KitchenSink)
## Year Status
## 1.147699 1.844672
## Adult.Mortality infant.deaths
## 1.791904 170.003056
## Alcohol percentage.expenditure
## 1.908148 1.669605
## Measles BMI
## 1.416355 1.760019
## under.five.deaths Polio
## 172.685555 1.993205
## Total.expenditure Diphtheria
## 1.204580 2.033602
## HIV.AIDS log(GDP)
## 1.486816 2.081129
## thinness..1.19.years thinness.5.9.years
## 8.502136 8.625215
## Income.composition.of.resources Schooling
## 3.206133 3.559931
full.model <- lm(Life.expectancy~ Status + Alcohol + percentage.expenditure +
Measles + BMI + under.five.deaths + Polio +
Total.expenditure + Diphtheria + HIV.AIDS + log(GDP) +
thinness..1.19.years +
Income.composition.of.resources + Schooling, training)
summary(full.model)
##
## Call:
## lm(formula = Life.expectancy ~ Status + Alcohol + percentage.expenditure +
## Measles + BMI + under.five.deaths + Polio + Total.expenditure +
## Diphtheria + HIV.AIDS + log(GDP) + thinness..1.19.years +
## Income.composition.of.resources + Schooling, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.4131 -2.6858 0.1732 2.7454 19.5163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.660e+01 8.279e-01 56.283 < 2e-16 ***
## StatusDeveloping -2.284e+00 3.362e-01 -6.794 1.38e-11 ***
## Alcohol -5.985e-02 3.282e-02 -1.824 0.0683 .
## percentage.expenditure 1.527e-04 6.299e-05 2.425 0.0154 *
## Measles -9.719e-06 9.525e-06 -1.020 0.3077
## BMI 5.473e-02 6.262e-03 8.740 < 2e-16 ***
## under.five.deaths -9.013e-04 7.723e-04 -1.167 0.2433
## Polio 2.864e-02 5.610e-03 5.106 3.56e-07 ***
## Total.expenditure 1.013e-01 4.285e-02 2.365 0.0181 *
## Diphtheria 4.782e-02 5.615e-03 8.515 < 2e-16 ***
## HIV.AIDS -6.789e-01 1.976e-02 -34.355 < 2e-16 ***
## log(GDP) 4.448e-01 7.848e-02 5.668 1.62e-08 ***
## thinness..1.19.years -6.931e-02 3.015e-02 -2.299 0.0216 *
## Income.composition.of.resources 7.455e+00 8.198e-01 9.094 < 2e-16 ***
## Schooling 7.672e-01 5.482e-02 13.994 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.583 on 2329 degrees of freedom
## Multiple R-squared: 0.7687, Adjusted R-squared: 0.7673
## F-statistic: 552.9 on 14 and 2329 DF, p-value: < 2.2e-16
plot(full.model)
vif(full.model)
## Status Alcohol
## 1.816727 1.809490
## percentage.expenditure Measles
## 1.659251 1.394714
## BMI under.five.deaths
## 1.717403 1.728815
## Polio Total.expenditure
## 1.984437 1.188715
## Diphtheria HIV.AIDS
## 1.996048 1.131097
## log(GDP) thinness..1.19.years
## 2.054859 1.956459
## Income.composition.of.resources Schooling
## 3.102486 3.510213
set.seed(2345)
eightVar = lm(Life.expectancy~HIV.AIDS+Schooling+Alcohol+BMI+Polio+Diphtheria+logGDP+thinness..1.19.years+Income.composition.of.resources, data = training)
summary(eightVar)
##
## Call:
## lm(formula = Life.expectancy ~ HIV.AIDS + Schooling + Alcohol +
## BMI + Polio + Diphtheria + logGDP + thinness..1.19.years +
## Income.composition.of.resources, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.4141 -2.5857 0.1184 2.7623 19.2743
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.206803 0.625395 69.087 < 2e-16 ***
## HIV.AIDS -0.681485 0.019905 -34.236 < 2e-16 ***
## Schooling 0.807629 0.055413 14.575 < 2e-16 ***
## Alcohol 0.048487 0.030055 1.613 0.107
## BMI 0.055012 0.006301 8.731 < 2e-16 ***
## Polio 0.029326 0.005694 5.151 2.82e-07 ***
## Diphtheria 0.048879 0.005683 8.601 < 2e-16 ***
## logGDP 0.579798 0.070900 8.178 4.69e-16 ***
## thinness..1.19.years -0.113199 0.027585 -4.104 4.21e-05 ***
## Income.composition.of.resources 7.810187 0.825457 9.462 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.657 on 2334 degrees of freedom
## Multiple R-squared: 0.7607, Adjusted R-squared: 0.7598
## F-statistic: 824.6 on 9 and 2334 DF, p-value: < 2.2e-16
confint(eightVar)
## 2.5 % 97.5 %
## (Intercept) 41.98041532 44.43319143
## HIV.AIDS -0.72051911 -0.64245093
## Schooling 0.69896506 0.91629379
## Alcohol -0.01045042 0.10742491
## BMI 0.04265642 0.06736755
## Polio 0.01816053 0.04049132
## Diphtheria 0.03773517 0.06002214
## logGDP 0.44076547 0.71883070
## thinness..1.19.years -0.16729240 -0.05910629
## Income.composition.of.resources 6.19148283 9.42889208
#Visuals for Residuals
plot(eightVar)
#Best final MLR
set.seed(2323)
lessvar = lm(Life.expectancy~HIV.AIDS+Schooling+BMI+Diphtheria+logGDP+thinness..1.19.years+Income.composition.of.resources, data = training)
summary(lessvar)
##
## Call:
## lm(formula = Life.expectancy ~ HIV.AIDS + Schooling + BMI + Diphtheria +
## logGDP + thinness..1.19.years + Income.composition.of.resources,
## data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.382 -2.545 0.095 2.768 20.119
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.642032 0.618828 70.524 < 2e-16 ***
## HIV.AIDS -0.680044 0.019876 -34.215 < 2e-16 ***
## Schooling 0.844081 0.054243 15.561 < 2e-16 ***
## BMI 0.056115 0.006332 8.862 < 2e-16 ***
## Diphtheria 0.067500 0.004437 15.211 < 2e-16 ***
## logGDP 0.599579 0.070806 8.468 < 2e-16 ***
## thinness..1.19.years -0.123684 0.026922 -4.594 4.58e-06 ***
## Income.composition.of.resources 7.952176 0.829769 9.584 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.684 on 2336 degrees of freedom
## Multiple R-squared: 0.7578, Adjusted R-squared: 0.757
## F-statistic: 1044 on 7 and 2336 DF, p-value: < 2.2e-16
confint(lessvar)
## 2.5 % 97.5 %
## (Intercept) 42.42852263 44.85554107
## HIV.AIDS -0.71901981 -0.64106761
## Schooling 0.73771169 0.95044987
## BMI 0.04369781 0.06853138
## Diphtheria 0.05879867 0.07620224
## logGDP 0.46073069 0.73842704
## thinness..1.19.years -0.17647798 -0.07089008
## Income.composition.of.resources 6.32501538 9.57933583
#Visuals for Residuals
plot(lessvar)
Feature Selection Tools: Penalized Regression
# Penalized Regression
#Setting kfold parameters
fitControl<-trainControl(method="repeatedcv",number=5,repeats=1)
#Fitting glmnet
set.seed(1234)
glmnet.fit<-train(Life.expectancy~infant.deaths + Alcohol +
percentage.expenditure + Measles + BMI +
under.five.deaths + Polio + Total.expenditure +
Diphtheria + HIV.AIDS + log(GDP) +
thinness..1.19.years + thinness.5.9.years +
Income.composition.of.resources + Schooling,
data=training,
method="glmnet",
trControl=fitControl
)
glmnet.fit
## glmnet
##
## 2344 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 1 times)
## Summary of sample sizes: 1875, 1877, 1874, 1875, 1875
## Resampling results across tuning parameters:
##
## alpha lambda RMSE Rsquared MAE
## 0.10 0.01359526 4.546733 0.7715121 3.428469
## 0.10 0.13595255 4.617190 0.7645704 3.477897
## 0.10 1.35952554 4.677957 0.7611292 3.543112
## 0.55 0.01359526 4.546678 0.7715357 3.427825
## 0.55 0.13595255 4.642083 0.7621296 3.494318
## 0.55 1.35952554 4.837724 0.7583217 3.646672
## 1.00 0.01359526 4.547465 0.7714789 3.427827
## 1.00 0.13595255 4.645565 0.7619759 3.493011
## 1.00 1.35952554 5.132045 0.7483316 3.890102
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.55 and lambda = 0.01359526.
plot(glmnet.fit)
#Investigating coefficients
opt.pen<-glmnet.fit$finalModel$lambdaOpt #penalty term
coef(glmnet.fit$finalModel,opt.pen)
## 16 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 4.460994e+01
## infant.deaths 7.072859e-02
## Alcohol 4.933926e-02
## percentage.expenditure 2.592483e-04
## Measles -6.563182e-06
## BMI 5.356624e-02
## under.five.deaths -5.267115e-02
## Polio 2.688020e-02
## Total.expenditure 1.322282e-01
## Diphtheria 4.334536e-02
## HIV.AIDS -6.760276e-01
## log(GDP) 4.439098e-01
## thinness..1.19.years -9.170910e-02
## thinness.5.9.years .
## Income.composition.of.resources 7.354446e+00
## Schooling 7.755679e-01
glmnet.fit.model <-lm(Life.expectancy~infant.deaths + Alcohol +
percentage.expenditure + Measles + BMI +
under.five.deaths + Polio + Total.expenditure +
Diphtheria + HIV.AIDS + log(GDP) +
thinness..1.19.years +
Income.composition.of.resources + Schooling,
data=training)
plot(glmnet.fit.model)
#Lets force a LASSO model and add complexity
set.seed(1234)
glmnet.fit2<-train(Life.expectancy~infant.deaths + Alcohol +
percentage.expenditure + Measles + BMI +
under.five.deaths + poly(Polio,2) + Total.expenditure +
Diphtheria + HIV.AIDS + log(GDP) +
thinness..1.19.years + thinness.5.9.years +
Income.composition.of.resources + Schooling,
data=training,
method="glmnet",
trControl=fitControl,
tuneGrid=expand.grid(data.frame(alpha=1,lambda=seq(0,.05,.001)))
)
glmnet.fit2
## glmnet
##
## 2344 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 1 times)
## Summary of sample sizes: 1875, 1877, 1874, 1875, 1875
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000 4.438996 0.7821622 3.353113
## 0.001 4.438996 0.7821622 3.353113
## 0.002 4.439178 0.7821469 3.353265
## 0.003 4.439604 0.7821102 3.353665
## 0.004 4.439905 0.7820844 3.354178
## 0.005 4.440391 0.7820411 3.354927
## 0.006 4.441047 0.7819817 3.355913
## 0.007 4.441877 0.7819056 3.356904
## 0.008 4.442881 0.7818127 3.357980
## 0.009 4.444052 0.7817038 3.359160
## 0.010 4.445390 0.7815786 3.360345
## 0.011 4.446942 0.7814329 3.361585
## 0.012 4.448665 0.7812705 3.362960
## 0.013 4.450543 0.7810932 3.364494
## 0.014 4.452643 0.7808933 3.366372
## 0.015 4.454931 0.7806745 3.368319
## 0.016 4.457387 0.7804391 3.370265
## 0.017 4.460005 0.7801876 3.372391
## 0.018 4.462793 0.7799192 3.374614
## 0.019 4.465766 0.7796322 3.376937
## 0.020 4.468895 0.7793297 3.379295
## 0.021 4.472143 0.7790153 3.381720
## 0.022 4.475542 0.7786857 3.384205
## 0.023 4.479080 0.7783420 3.386881
## 0.024 4.482819 0.7779772 3.389618
## 0.025 4.486731 0.7775947 3.392610
## 0.026 4.490825 0.7771933 3.395700
## 0.027 4.495096 0.7767735 3.398865
## 0.028 4.499526 0.7763374 3.402101
## 0.029 4.504118 0.7758847 3.405358
## 0.030 4.508868 0.7754155 3.408641
## 0.031 4.513601 0.7749468 3.411809
## 0.032 4.517809 0.7745270 3.414992
## 0.033 4.522146 0.7740937 3.418897
## 0.034 4.526346 0.7736733 3.422622
## 0.035 4.529756 0.7733298 3.425748
## 0.036 4.533274 0.7729749 3.428874
## 0.037 4.536688 0.7726295 3.431850
## 0.038 4.538484 0.7724495 3.433561
## 0.039 4.539767 0.7723218 3.434705
## 0.040 4.541073 0.7721918 3.435852
## 0.041 4.542062 0.7720926 3.436751
## 0.042 4.542531 0.7720457 3.437187
## 0.043 4.542508 0.7720491 3.437149
## 0.044 4.542487 0.7720524 3.437110
## 0.045 4.542467 0.7720556 3.437071
## 0.046 4.542450 0.7720586 3.437032
## 0.047 4.542435 0.7720615 3.436993
## 0.048 4.542421 0.7720644 3.436953
## 0.049 4.542409 0.7720671 3.436915
## 0.050 4.542399 0.7720696 3.436881
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.001.
plot(glmnet.fit2)
opt.pen<-glmnet.fit2$finalModel$lambdaOpt #penalty term
coef(glmnet.fit2$finalModel,opt.pen)
## 17 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 5.044976e+01
## infant.deaths 1.003575e-01
## Alcohol 4.928420e-02
## percentage.expenditure 3.159795e-04
## Measles -3.689026e-06
## BMI 5.226929e-02
## under.five.deaths -7.357662e-02
## poly(Polio, 2)1 5.473565e+01
## poly(Polio, 2)2 5.245271e+01
## Total.expenditure 1.330491e-01
## Diphtheria 2.136060e-02
## HIV.AIDS -6.514221e-01
## log(GDP) 3.653079e-01
## thinness..1.19.years -1.069419e-01
## thinness.5.9.years .
## Income.composition.of.resources 6.837594e+00
## Schooling 7.006484e-01
# Different way to do GLMNET
x=model.matrix(Life.expectancy~infant.deaths + Alcohol +
percentage.expenditure + Measles + BMI +
under.five.deaths + Polio + Total.expenditure +
Diphtheria + HIV.AIDS + log(GDP) +
thinness..1.19.years + thinness.5.9.years +
Income.composition.of.resources + Schooling,
training)[,-1]
y=log(training$Life.expectancy)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-7
set.seed(1234)
grid=10^seq(10,-2, length =100)
lasso.mod=glmnet(x,y,alpha=1, lambda =grid)
cv.out=cv.glmnet(x,y,alpha=1)
plot(cv.out)
bestlambda<-cv.out$lambda.1se
coef(cv.out,s=bestlambda)
## 16 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.844715e+00
## infant.deaths 3.126175e-04
## Alcohol 1.412272e-04
## percentage.expenditure 2.088892e-06
## Measles -2.066844e-07
## BMI 8.656727e-04
## under.five.deaths -2.420026e-04
## Polio 4.313928e-04
## Total.expenditure 1.324203e-03
## Diphtheria 7.540932e-04
## HIV.AIDS -1.150787e-02
## log(GDP) 6.791475e-03
## thinness..1.19.years -8.147405e-04
## thinness.5.9.years .
## Income.composition.of.resources 1.152774e-01
## Schooling 1.190970e-02
Feature Selection Tools: Forward Selection
fwd.selection = lm(Life.expectancy~infant.deaths + Alcohol +
percentage.expenditure + Measles + BMI +
under.five.deaths + Polio + Total.expenditure +
Diphtheria + HIV.AIDS + log(GDP) +
thinness..1.19.years + thinness.5.9.years +
Income.composition.of.resources + Schooling, data = training)
# Forward
ols_step_forward_p(fwd.selection, penter = 0.05, details = TRUE)
## Forward Selection Method
## ---------------------------
##
## Candidate Terms:
##
## 1. infant.deaths
## 2. Alcohol
## 3. percentage.expenditure
## 4. Measles
## 5. BMI
## 6. under.five.deaths
## 7. Polio
## 8. Total.expenditure
## 9. Diphtheria
## 10. HIV.AIDS
## 11. log(GDP)
## 12. thinness..1.19.years
## 13. thinness.5.9.years
## 14. Income.composition.of.resources
## 15. Schooling
##
## We are selecting variables based on p value...
##
##
## Forward Selection: Step 1
##
## - Schooling
##
## Model Summary
## --------------------------------------------------------------
## R 0.742 RMSE 6.376
## R-Squared 0.550 Coef. Var 9.217
## Adj. R-Squared 0.550 MSE 40.657
## Pred R-Squared 0.548 MAE 4.522
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------------
## Regression 116351.776 2 58175.888 1430.894 0.0000
## Residual 95178.071 2341 40.657
## Total 211529.848 2343
## --------------------------------------------------------------------------
##
## Parameter Estimates
## ------------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ------------------------------------------------------------------------------------------------------------
## (Intercept) 43.597 0.507 86.019 0.000 42.603 44.591
## Income.composition.of.resources 15.377 1.093 0.329 14.063 0.000 13.233 17.521
## Schooling 1.322 0.069 0.450 19.231 0.000 1.187 1.457
## ------------------------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 2
##
## - HIV.AIDS
##
## Model Summary
## --------------------------------------------------------------
## R 0.838 RMSE 5.191
## R-Squared 0.702 Coef. Var 7.504
## Adj. R-Squared 0.702 MSE 26.948
## Pred R-Squared 0.700 MAE 3.727
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------------
## Regression 148470.718 3 49490.239 1836.485 0.0000
## Residual 63059.129 2340 26.948
## Total 211529.848 2343
## --------------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------------------
## (Intercept) 48.062 0.432 111.146 0.000 47.214 48.910
## Income.composition.of.resources 11.453 0.897 0.245 12.762 0.000 9.693 13.213
## Schooling 1.267 0.056 0.431 22.619 0.000 1.157 1.376
## HIV.AIDS -0.750 0.022 -0.402 -34.523 0.000 -0.793 -0.707
## -------------------------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 3
##
## - Diphtheria
##
## Model Summary
## --------------------------------------------------------------
## R 0.855 RMSE 4.931
## R-Squared 0.731 Coef. Var 7.128
## Adj. R-Squared 0.731 MSE 24.314
## Pred R-Squared 0.729 MAE 3.605
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------------
## Regression 154658.745 4 38664.686 1590.205 0.0000
## Residual 56871.102 2339 24.314
## Total 211529.848 2343
## --------------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------------------
## (Intercept) 44.282 0.474 93.387 0.000 43.352 45.212
## Income.composition.of.resources 10.312 0.855 0.221 12.055 0.000 8.635 11.989
## Schooling 1.129 0.054 0.385 20.962 0.000 1.024 1.235
## HIV.AIDS -0.722 0.021 -0.387 -34.868 0.000 -0.763 -0.682
## Diphtheria 0.074 0.005 0.186 15.953 0.000 0.065 0.083
## -------------------------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 4
##
## - BMI
##
## Model Summary
## --------------------------------------------------------------
## R 0.865 RMSE 4.776
## R-Squared 0.748 Coef. Var 6.904
## Adj. R-Squared 0.747 MSE 22.813
## Pred R-Squared 0.746 MAE 3.544
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------------
## Regression 158193.664 5 31638.733 1386.889 0.0000
## Residual 53336.184 2338 22.813
## Total 211529.848 2343
## --------------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------------------
## (Intercept) 44.336 0.459 96.525 0.000 43.435 45.237
## Income.composition.of.resources 9.331 0.832 0.200 11.211 0.000 7.699 10.963
## Schooling 0.970 0.054 0.330 18.043 0.000 0.864 1.075
## HIV.AIDS -0.690 0.020 -0.370 -34.134 0.000 -0.730 -0.651
## Diphtheria 0.069 0.005 0.174 15.309 0.000 0.060 0.078
## BMI 0.074 0.006 0.154 12.448 0.000 0.062 0.086
## -------------------------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 5
##
## - log(GDP)
##
## Model Summary
## --------------------------------------------------------------
## R 0.869 RMSE 4.704
## R-Squared 0.756 Coef. Var 6.799
## Adj. R-Squared 0.755 MSE 22.125
## Pred R-Squared 0.753 MAE 3.506
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------------
## Regression 159823.821 6 26637.303 1203.948 0.0000
## Residual 51706.027 2337 22.125
## Total 211529.848 2343
## --------------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------------------
## (Intercept) 42.099 0.522 80.644 0.000 41.076 43.123
## Income.composition.of.resources 8.057 0.833 0.173 9.672 0.000 6.424 9.691
## Schooling 0.872 0.054 0.297 16.098 0.000 0.765 0.978
## HIV.AIDS -0.686 0.020 -0.368 -34.417 0.000 -0.725 -0.647
## Diphtheria 0.068 0.004 0.171 15.333 0.000 0.060 0.077
## BMI 0.067 0.006 0.140 11.352 0.000 0.055 0.079
## log(GDP) 0.610 0.071 0.111 8.584 0.000 0.471 0.749
## -------------------------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 6
##
## - Polio
##
## Model Summary
## --------------------------------------------------------------
## R 0.871 RMSE 4.679
## R-Squared 0.758 Coef. Var 6.763
## Adj. R-Squared 0.758 MSE 21.889
## Pred R-Squared 0.756 MAE 3.491
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------------
## Regression 160397.905 7 22913.986 1046.842 0.0000
## Residual 51131.943 2336 21.889
## Total 211529.848 2343
## --------------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------------------
## (Intercept) 41.581 0.529 78.597 0.000 40.543 42.618
## Income.composition.of.resources 7.914 0.829 0.169 9.546 0.000 6.288 9.540
## Schooling 0.855 0.054 0.291 15.850 0.000 0.749 0.961
## HIV.AIDS -0.683 0.020 -0.367 -34.475 0.000 -0.722 -0.644
## Diphtheria 0.050 0.006 0.125 8.740 0.000 0.039 0.061
## BMI 0.066 0.006 0.137 11.195 0.000 0.054 0.077
## log(GDP) 0.604 0.071 0.110 8.539 0.000 0.465 0.742
## Polio 0.029 0.006 0.073 5.121 0.000 0.018 0.041
## -------------------------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 7
##
## - percentage.expenditure
##
## Model Summary
## --------------------------------------------------------------
## R 0.872 RMSE 4.657
## R-Squared 0.761 Coef. Var 6.732
## Adj. R-Squared 0.760 MSE 21.689
## Pred R-Squared 0.758 MAE 3.479
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------------
## Regression 160886.865 8 20110.858 927.253 0.0000
## Residual 50642.982 2335 21.689
## Total 211529.848 2343
## -------------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------------------
## (Intercept) 42.827 0.588 72.787 0.000 41.673 43.980
## Income.composition.of.resources 7.810 0.826 0.167 9.460 0.000 6.191 9.429
## Schooling 0.839 0.054 0.286 15.592 0.000 0.733 0.944
## HIV.AIDS -0.685 0.020 -0.367 -34.709 0.000 -0.724 -0.646
## Diphtheria 0.050 0.006 0.127 8.882 0.000 0.039 0.062
## BMI 0.067 0.006 0.139 11.403 0.000 0.055 0.078
## log(GDP) 0.431 0.079 0.078 5.441 0.000 0.276 0.586
## Polio 0.029 0.006 0.074 5.161 0.000 0.018 0.041
## percentage.expenditure 0.000 0.000 0.059 4.748 0.000 0.000 0.000
## -------------------------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 8
##
## - thinness..1.19.years
##
## Model Summary
## --------------------------------------------------------------
## R 0.873 RMSE 4.640
## R-Squared 0.762 Coef. Var 6.707
## Adj. R-Squared 0.762 MSE 21.527
## Pred R-Squared 0.760 MAE 3.481
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------------
## Regression 161286.681 9 17920.742 832.492 0.0000
## Residual 50243.166 2334 21.527
## Total 211529.848 2343
## -------------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------------------
## (Intercept) 44.180 0.665 66.433 0.000 42.876 45.484
## Income.composition.of.resources 7.719 0.823 0.165 9.382 0.000 6.105 9.332
## Schooling 0.814 0.054 0.277 15.106 0.000 0.709 0.920
## HIV.AIDS -0.680 0.020 -0.364 -34.497 0.000 -0.718 -0.641
## Diphtheria 0.050 0.006 0.125 8.769 0.000 0.039 0.061
## BMI 0.057 0.006 0.118 8.989 0.000 0.044 0.069
## log(GDP) 0.433 0.079 0.079 5.483 0.000 0.278 0.588
## Polio 0.029 0.006 0.074 5.188 0.000 0.018 0.041
## percentage.expenditure 0.000 0.000 0.055 4.437 0.000 0.000 0.000
## thinness..1.19.years -0.115 0.027 -0.053 -4.310 0.000 -0.168 -0.063
## -------------------------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 9
##
## - Total.expenditure
##
## Model Summary
## --------------------------------------------------------------
## R 0.874 RMSE 4.629
## R-Squared 0.764 Coef. Var 6.691
## Adj. R-Squared 0.763 MSE 21.428
## Pred R-Squared 0.761 MAE 3.474
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------------
## Regression 161538.357 10 16153.836 753.866 0.0000
## Residual 49991.490 2333 21.428
## Total 211529.848 2343
## -------------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------------------
## (Intercept) 43.422 0.699 62.084 0.000 42.051 44.794
## Income.composition.of.resources 7.910 0.823 0.169 9.615 0.000 6.297 9.524
## Schooling 0.795 0.054 0.271 14.704 0.000 0.689 0.901
## HIV.AIDS -0.685 0.020 -0.367 -34.741 0.000 -0.724 -0.646
## Diphtheria 0.049 0.006 0.122 8.580 0.000 0.037 0.060
## BMI 0.054 0.006 0.113 8.631 0.000 0.042 0.067
## log(GDP) 0.454 0.079 0.083 5.751 0.000 0.299 0.609
## Polio 0.029 0.006 0.073 5.177 0.000 0.018 0.040
## percentage.expenditure 0.000 0.000 0.051 4.029 0.000 0.000 0.000
## thinness..1.19.years -0.102 0.027 -0.047 -3.795 0.000 -0.155 -0.049
## Total.expenditure 0.145 0.042 0.037 3.427 0.001 0.062 0.228
## -------------------------------------------------------------------------------------------------------------
##
##
##
## No more variables to be added.
##
## Variables Entered:
##
## + Income.composition.of.resources
## + Schooling
## + HIV.AIDS
## + Diphtheria
## + BMI
## + log(GDP)
## + Polio
## + percentage.expenditure
## + thinness..1.19.years
## + Total.expenditure
##
##
## Final Model Output
## ------------------
##
## Model Summary
## --------------------------------------------------------------
## R 0.874 RMSE 4.629
## R-Squared 0.764 Coef. Var 6.691
## Adj. R-Squared 0.763 MSE 21.428
## Pred R-Squared 0.761 MAE 3.474
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------------
## Regression 161538.357 10 16153.836 753.866 0.0000
## Residual 49991.490 2333 21.428
## Total 211529.848 2343
## -------------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------------------
## (Intercept) 43.422 0.699 62.084 0.000 42.051 44.794
## Income.composition.of.resources 7.910 0.823 0.169 9.615 0.000 6.297 9.524
## Schooling 0.795 0.054 0.271 14.704 0.000 0.689 0.901
## HIV.AIDS -0.685 0.020 -0.367 -34.741 0.000 -0.724 -0.646
## Diphtheria 0.049 0.006 0.122 8.580 0.000 0.037 0.060
## BMI 0.054 0.006 0.113 8.631 0.000 0.042 0.067
## log(GDP) 0.454 0.079 0.083 5.751 0.000 0.299 0.609
## Polio 0.029 0.006 0.073 5.177 0.000 0.018 0.040
## percentage.expenditure 0.000 0.000 0.051 4.029 0.000 0.000 0.000
## thinness..1.19.years -0.102 0.027 -0.047 -3.795 0.000 -0.155 -0.049
## Total.expenditure 0.145 0.042 0.037 3.427 0.001 0.062 0.228
## -------------------------------------------------------------------------------------------------------------
##
## Selection Summary
## ----------------------------------------------------------------------------------------------------
## Variable Adj.
## Step Entered R-Square R-Square C(p) AIC RMSE
## ----------------------------------------------------------------------------------------------------
## 1 Income.composition.of.resources 0.5500 0.5497 2327.9505 15341.9036 6.3763
## 2 Schooling 0.7019 0.7015 755.3715 14378.9334 5.1912
## 3 HIV.AIDS 0.7311 0.7307 454.0135 14138.8322 4.9310
## 4 Diphtheria 0.7479 0.7473 282.7199 13990.4121 4.7763
## 5 BMI 0.7556 0.7549 204.8041 13919.6529 4.7037
## 6 log(GDP) 0.7583 0.7576 178.6605 13895.4823 4.6785
## 7 Polio 0.7606 0.7598 156.6900 13874.9594 4.6571
## 8 percentage.expenditure 0.7625 0.7616 139.0897 13858.3806 4.6397
## 9 thinness..1.19.years 0.7637 0.7627 128.7517 13848.6096 4.6290
## 10 Total.expenditure NA NA NA NA NA
## ----------------------------------------------------------------------------------------------------
# Forward Selection Chosen Model
fwd.select.model <- lm(Life.expectancy~ Income.composition.of.resources + Schooling +
HIV.AIDS + Diphtheria +
BMI + log(GDP) + Polio +
percentage.expenditure + thinness..1.19.years + Total.expenditure, training)
plot(fwd.select.model)
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
reg.fwd=regsubsets(Life.expectancy~infant.deaths + Alcohol +
percentage.expenditure + Measles + BMI +
under.five.deaths + Polio + Total.expenditure +
Diphtheria + HIV.AIDS + log(GDP) +
thinness..1.19.years + thinness.5.9.years +
Income.composition.of.resources + Schooling,
data=training,method="forward",nvmax=15)
summary(reg.fwd)$adjr2
## [1] 0.5118279 0.6808674 0.7140742 0.7338469 0.7473158 0.7549340 0.7575512
## [8] 0.7597668 0.7615613 0.7626540 0.7628746 0.7628610 0.7737752 0.7741531
## [15] 0.7740578
summary(reg.fwd)$rss
## [1] 103218.90 67448.44 60404.40 56203.20 53336.18 51706.03 51131.94
## [8] 50642.98 50243.17 49991.49 49923.61 49905.07 47587.78 47487.89
## [15] 47487.55
summary(reg.fwd)$bic
## [1] -1666.334 -2655.919 -2906.706 -3067.922 -3182.891 -3247.891 -3266.302
## [8] -3281.065 -3291.884 -3295.895 -3291.321 -3284.432 -3388.122 -3385.287
## [15] -3377.544
par(mfrow=c(1,3))
bics<-summary(reg.fwd)$bic
plot(1:15,bics,type="l",ylab="BIC",xlab="# of predictors")
index<-which(bics==min(bics))
points(index,bics[index],col="red",pch=10)
adjr2<-summary(reg.fwd)$adjr2
plot(1:15,adjr2,type="l",ylab="Adjusted R-squared",xlab="# of predictors")
index<-which(adjr2==max(adjr2))
points(index,adjr2[index],col="red",pch=10)
rss<-summary(reg.fwd)$rss
plot(1:15,rss,type="l",ylab="train RSS",xlab="# of predictors")
index<-which(rss==min(rss))
points(index,rss[index],col="red",pch=10)
Backward Selection
bck.selection = lm(Life.expectancy~infant.deaths + Alcohol +
percentage.expenditure + Measles + BMI +
under.five.deaths + Polio + Total.expenditure +
Diphtheria + HIV.AIDS + log(GDP) +
thinness..1.19.years + thinness.5.9.years +
Income.composition.of.resources + Schooling, data = training)
# Backward
ols_step_backward_p(bck.selection, prem = 0.05, details = TRUE)
## Backward Elimination Method
## ---------------------------
##
## Candidate Terms:
##
## 1 . infant.deaths
## 2 . Alcohol
## 3 . percentage.expenditure
## 4 . Measles
## 5 . BMI
## 6 . under.five.deaths
## 7 . Polio
## 8 . Total.expenditure
## 9 . Diphtheria
## 10 . HIV.AIDS
## 11 . log(GDP)
## 12 . thinness..1.19.years
## 13 . thinness.5.9.years
## 14 . Income.composition.of.resources
## 15 . Schooling
##
## We are eliminating variables based on p value...
##
## - thinness.5.9.years
##
## Backward Elimination: Step 1
##
## Variable thinness.5.9.years Removed
##
## Model Summary
## --------------------------------------------------------------
## R 0.881 RMSE 4.516
## R-Squared 0.776 Coef. Var 6.527
## Adj. R-Squared 0.774 MSE 20.390
## Pred R-Squared 0.771 MAE 3.389
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------------
## Regression 164041.953 14 11717.282 574.663 0.0000
## Residual 47487.895 2329 20.390
## Total 211529.848 2343
## -------------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------------------
## (Intercept) 45.159 0.702 64.284 0.000 43.781 46.536
## infant.deaths 0.112 0.010 1.387 10.864 0.000 0.091 0.132
## Alcohol 0.067 0.030 0.027 2.213 0.027 0.008 0.127
## percentage.expenditure 0.000 0.000 0.055 4.452 0.000 0.000 0.000
## Measles 0.000 0.000 -0.005 -0.419 0.675 0.000 0.000
## BMI 0.053 0.006 0.111 8.655 0.000 0.041 0.065
## under.five.deaths -0.083 0.008 -1.404 -10.912 0.000 -0.098 -0.068
## Polio 0.026 0.006 0.065 4.670 0.000 0.015 0.037
## Total.expenditure 0.132 0.042 0.034 3.160 0.002 0.050 0.215
## Diphtheria 0.041 0.006 0.103 7.341 0.000 0.030 0.052
## HIV.AIDS -0.671 0.019 -0.360 -34.460 0.000 -0.710 -0.633
## log(GDP) 0.443 0.077 0.081 5.723 0.000 0.291 0.594
## thinness..1.19.years -0.096 0.030 -0.044 -3.233 0.001 -0.154 -0.038
## Income.composition.of.resources 7.023 0.809 0.150 8.681 0.000 5.436 8.609
## Schooling 0.768 0.054 0.262 14.235 0.000 0.662 0.874
## -------------------------------------------------------------------------------------------------------------
##
##
## - Measles
##
## Backward Elimination: Step 2
##
## Variable Measles Removed
##
## Model Summary
## --------------------------------------------------------------
## R 0.881 RMSE 4.515
## R-Squared 0.775 Coef. Var 6.526
## Adj. R-Squared 0.774 MSE 20.383
## Pred R-Squared 0.772 MAE 3.389
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------------
## Regression 164038.369 13 12618.336 619.074 0.0000
## Residual 47491.479 2330 20.383
## Total 211529.848 2343
## -------------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------------------
## (Intercept) 45.142 0.701 64.370 0.000 43.767 46.518
## infant.deaths 0.112 0.010 1.391 10.919 0.000 0.092 0.132
## Alcohol 0.067 0.030 0.027 2.208 0.027 0.007 0.127
## percentage.expenditure 0.000 0.000 0.055 4.459 0.000 0.000 0.000
## BMI 0.054 0.006 0.112 8.710 0.000 0.042 0.066
## under.five.deaths -0.083 0.008 -1.411 -11.035 0.000 -0.098 -0.068
## Polio 0.026 0.006 0.065 4.667 0.000 0.015 0.037
## Total.expenditure 0.133 0.042 0.034 3.185 0.001 0.051 0.215
## Diphtheria 0.041 0.006 0.103 7.348 0.000 0.030 0.052
## HIV.AIDS -0.671 0.019 -0.360 -34.464 0.000 -0.709 -0.633
## log(GDP) 0.442 0.077 0.080 5.719 0.000 0.290 0.594
## thinness..1.19.years -0.095 0.030 -0.044 -3.212 0.001 -0.153 -0.037
## Income.composition.of.resources 7.035 0.808 0.151 8.704 0.000 5.450 8.620
## Schooling 0.767 0.054 0.261 14.231 0.000 0.662 0.873
## -------------------------------------------------------------------------------------------------------------
##
##
##
## No more variables satisfy the condition of p value = 0.05
##
##
## Variables Removed:
##
## - thinness.5.9.years
## - Measles
##
##
## Final Model Output
## ------------------
##
## Model Summary
## --------------------------------------------------------------
## R 0.881 RMSE 4.515
## R-Squared 0.775 Coef. Var 6.526
## Adj. R-Squared 0.774 MSE 20.383
## Pred R-Squared 0.772 MAE 3.389
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------------
## Regression 164038.369 13 12618.336 619.074 0.0000
## Residual 47491.479 2330 20.383
## Total 211529.848 2343
## -------------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------------------
## (Intercept) 45.142 0.701 64.370 0.000 43.767 46.518
## infant.deaths 0.112 0.010 1.391 10.919 0.000 0.092 0.132
## Alcohol 0.067 0.030 0.027 2.208 0.027 0.007 0.127
## percentage.expenditure 0.000 0.000 0.055 4.459 0.000 0.000 0.000
## BMI 0.054 0.006 0.112 8.710 0.000 0.042 0.066
## under.five.deaths -0.083 0.008 -1.411 -11.035 0.000 -0.098 -0.068
## Polio 0.026 0.006 0.065 4.667 0.000 0.015 0.037
## Total.expenditure 0.133 0.042 0.034 3.185 0.001 0.051 0.215
## Diphtheria 0.041 0.006 0.103 7.348 0.000 0.030 0.052
## HIV.AIDS -0.671 0.019 -0.360 -34.464 0.000 -0.709 -0.633
## log(GDP) 0.442 0.077 0.080 5.719 0.000 0.290 0.594
## thinness..1.19.years -0.095 0.030 -0.044 -3.212 0.001 -0.153 -0.037
## Income.composition.of.resources 7.035 0.808 0.151 8.704 0.000 5.450 8.620
## Schooling 0.767 0.054 0.261 14.231 0.000 0.662 0.873
## -------------------------------------------------------------------------------------------------------------
##
##
## Elimination Summary
## -------------------------------------------------------------------------------------
## Variable Adj.
## Step Removed R-Square R-Square C(p) AIC RMSE
## -------------------------------------------------------------------------------------
## 1 thinness.5.9.years 0.7755 0.7742 14.0170 13736.1797 4.5155
## 2 Measles 0.7755 0.7742 12.1927 13734.3566 4.5147
## -------------------------------------------------------------------------------------
# Backward Selection Chosen Model
bck.select.model <- lm(Life.expectancy~infant.deaths + Alcohol +
percentage.expenditure + BMI +
under.five.deaths + Polio + Total.expenditure +
Diphtheria + HIV.AIDS + log(GDP) +
thinness..1.19.years +
Income.composition.of.resources + Schooling, data = training)
plot(bck.select.model)
Stepwise Selection
sw.selection = lm(Life.expectancy~infant.deaths + Alcohol +
percentage.expenditure + Measles + BMI +
under.five.deaths + Polio + Total.expenditure +
Diphtheria + HIV.AIDS + log(GDP) +
thinness..1.19.years + thinness.5.9.years +
Income.composition.of.resources + Schooling, data = training)
summary(sw.selection)
##
## Call:
## lm(formula = Life.expectancy ~ infant.deaths + Alcohol + percentage.expenditure +
## Measles + BMI + under.five.deaths + Polio + Total.expenditure +
## Diphtheria + HIV.AIDS + log(GDP) + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling,
## data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.0853 -2.5875 0.0947 2.6344 18.6763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.515e+01 7.078e-01 63.787 < 2e-16 ***
## infant.deaths 1.115e-01 1.028e-02 10.853 < 2e-16 ***
## Alcohol 6.734e-02 3.040e-02 2.215 0.02686 *
## percentage.expenditure 2.696e-04 6.057e-05 4.451 8.96e-06 ***
## Measles -3.885e-06 9.417e-06 -0.413 0.67999
## BMI 5.349e-02 6.217e-03 8.604 < 2e-16 ***
## under.five.deaths -8.275e-02 7.588e-03 -10.906 < 2e-16 ***
## Polio 2.584e-02 5.535e-03 4.669 3.20e-06 ***
## Total.expenditure 1.327e-01 4.197e-02 3.161 0.00159 **
## Diphtheria 4.087e-02 5.571e-03 7.336 3.03e-13 ***
## HIV.AIDS -6.714e-01 1.951e-02 -34.419 < 2e-16 ***
## log(GDP) 4.431e-01 7.745e-02 5.721 1.20e-08 ***
## thinness..1.19.years -1.031e-01 6.194e-02 -1.665 0.09611 .
## thinness.5.9.years 7.969e-03 6.110e-02 0.130 0.89624
## Income.composition.of.resources 7.023e+00 8.091e-01 8.679 < 2e-16 ***
## Schooling 7.680e-01 5.398e-02 14.226 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.516 on 2328 degrees of freedom
## Multiple R-squared: 0.7755, Adjusted R-squared: 0.7741
## F-statistic: 536.1 on 15 and 2328 DF, p-value: < 2.2e-16
# Stepwise
ols_step_both_p(sw.selection, prem = 0.05, pent = 0.05, details = FALSE)
##
## Stepwise Selection Summary
## ----------------------------------------------------------------------------------------------------------------
## Added/ Adj.
## Step Variable Removed R-Square R-Square C(p) AIC RMSE
## ----------------------------------------------------------------------------------------------------------------
## 1 Income.composition.of.resources addition 0.550 0.550 2327.9510 15341.9036 6.3763
## 2 Schooling addition 0.702 0.702 755.3720 14378.9334 5.1912
## 3 HIV.AIDS addition 0.731 0.731 454.0140 14138.8322 4.9310
## 4 Diphtheria addition 0.748 0.747 282.7200 13990.4121 4.7763
## 5 BMI addition 0.756 0.755 204.8040 13919.6529 4.7037
## 6 log(GDP) addition 0.758 0.758 178.6610 13895.4823 4.6785
## 7 Polio addition 0.761 0.760 156.6900 13874.9594 4.6571
## 8 percentage.expenditure addition 0.762 0.762 139.0900 13858.3806 4.6397
## 9 thinness..1.19.years addition 0.764 0.763 128.7520 13848.6096 4.6290
## ----------------------------------------------------------------------------------------------------------------
# Stepwise Selection Chosen Model
sw.select.model = lm(Life.expectancy~ percentage.expenditure +
BMI + Polio + Diphtheria + HIV.AIDS +
log(GDP) + thinness..1.19.years +
Income.composition.of.resources + Schooling, data = training)
plot(sw.select.model)
Forward Selection Validation
fwd.train=regsubsets(Life.expectancy~infant.deaths + Alcohol +
percentage.expenditure + Measles + BMI +
under.five.deaths + Polio + Total.expenditure +
Diphtheria + HIV.AIDS + log(GDP) +
thinness..1.19.years + thinness.5.9.years +
Income.composition.of.resources + Schooling,
data=training,method="forward",nvmax=15)
#Creating a prediction function
predict.regsubsets =function (object , newdata ,id ,...){
form=as.formula (object$call [[2]])
mat=model.matrix(form ,newdata )
coefi=coef(object ,id=id)
xvars=names(coefi)
mat[,xvars]%*%coefi
}
valMSE<-c()
#note my index, i, is to 15 since that is how many predictors I went up to during fwd selection
for (i in 1:15){
predictions<-predict.regsubsets(object=fwd.train,newdata=validate,id=i)
valMSE[i]<-mean((validate$Life.expectancy-predictions)^2)
}
par(mfrow=c(1,1))
plot(1:15,sqrt(valMSE),type="l",xlab="# of predictors",
ylab="test vs train RMSE")
index<-which(valMSE==min(valMSE))
points(index,sqrt(valMSE[index]),col="red",pch=10)
trainMSE<-summary(fwd.train)$rss/nrow(training)
lines(1:15,sqrt(trainMSE),lty=3,col="blue")
# Forward, Backward, Stepwise Selection with AIC
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:olsrr':
##
## cement
## The following object is masked from 'package:dplyr':
##
## select
#Full Model
set.seed(1246)
fitFull = lm(Life.expectancy~Adult.Mortality +
infant.deaths + Alcohol + percentage.expenditure + Measles +
BMI + under.five.deaths + Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + thinness.5.9.years + Income.composition.of.resources + Schooling + logGDP,training)
stepup = stepAIC(fitFull, direction = "forward", steps = 2000)
## Start: AIC=6586.08
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP
stepdown= stepAIC(fitFull, direction = "backward", steps = 2000)
## Start: AIC=6586.08
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP
##
## Df Sum of Sq RSS AIC
## - thinness.5.9.years 1 9.3 38373 6584.6
## <none> 38364 6586.1
## - thinness..1.19.years 1 64.6 38428 6588.0
## - Measles 1 85.5 38449 6589.3
## - Total.expenditure 1 115.3 38479 6591.1
## - percentage.expenditure 1 209.3 38573 6596.8
## - Alcohol 1 227.5 38591 6597.9
## - Polio 1 360.2 38724 6606.0
## - logGDP 1 488.4 38852 6613.7
## - Diphtheria 1 869.0 39233 6636.6
## - BMI 1 884.3 39248 6637.5
## - Income.composition.of.resources 1 997.9 39362 6644.3
## - infant.deaths 1 1914.8 40279 6698.2
## - under.five.deaths 1 1950.0 40314 6700.3
## - Schooling 1 3043.1 41407 6763.0
## - HIV.AIDS 1 8282.4 46646 7042.3
## - Adult.Mortality 1 9123.8 47488 7084.2
##
## Step: AIC=6584.64
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## Income.composition.of.resources + Schooling + logGDP
##
## Df Sum of Sq RSS AIC
## <none> 38373 6584.6
## - Measles 1 88.3 38461 6588.0
## - Total.expenditure 1 111.9 38485 6589.5
## - thinness..1.19.years 1 125.1 38498 6590.3
## - percentage.expenditure 1 209.3 38582 6595.4
## - Alcohol 1 225.6 38599 6596.4
## - Polio 1 360.1 38733 6604.5
## - logGDP 1 482.6 38856 6611.9
## - Diphtheria 1 873.4 39246 6635.4
## - BMI 1 875.2 39248 6635.5
## - Income.composition.of.resources 1 998.5 39372 6642.9
## - infant.deaths 1 1925.1 40298 6697.4
## - under.five.deaths 1 1955.5 40329 6699.2
## - Schooling 1 3052.4 41425 6762.1
## - HIV.AIDS 1 8273.2 46646 7040.3
## - Adult.Mortality 1 9114.8 47488 7082.2
stepboth = stepAIC(fitFull, direction = "both", steps = 2000)
## Start: AIC=6586.08
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP
##
## Df Sum of Sq RSS AIC
## - thinness.5.9.years 1 9.3 38373 6584.6
## <none> 38364 6586.1
## - thinness..1.19.years 1 64.6 38428 6588.0
## - Measles 1 85.5 38449 6589.3
## - Total.expenditure 1 115.3 38479 6591.1
## - percentage.expenditure 1 209.3 38573 6596.8
## - Alcohol 1 227.5 38591 6597.9
## - Polio 1 360.2 38724 6606.0
## - logGDP 1 488.4 38852 6613.7
## - Diphtheria 1 869.0 39233 6636.6
## - BMI 1 884.3 39248 6637.5
## - Income.composition.of.resources 1 997.9 39362 6644.3
## - infant.deaths 1 1914.8 40279 6698.2
## - under.five.deaths 1 1950.0 40314 6700.3
## - Schooling 1 3043.1 41407 6763.0
## - HIV.AIDS 1 8282.4 46646 7042.3
## - Adult.Mortality 1 9123.8 47488 7084.2
##
## Step: AIC=6584.64
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## Income.composition.of.resources + Schooling + logGDP
##
## Df Sum of Sq RSS AIC
## <none> 38373 6584.6
## + thinness.5.9.years 1 9.3 38364 6586.1
## - Measles 1 88.3 38461 6588.0
## - Total.expenditure 1 111.9 38485 6589.5
## - thinness..1.19.years 1 125.1 38498 6590.3
## - percentage.expenditure 1 209.3 38582 6595.4
## - Alcohol 1 225.6 38599 6596.4
## - Polio 1 360.1 38733 6604.5
## - logGDP 1 482.6 38856 6611.9
## - Diphtheria 1 873.4 39246 6635.4
## - BMI 1 875.2 39248 6635.5
## - Income.composition.of.resources 1 998.5 39372 6642.9
## - infant.deaths 1 1925.1 40298 6697.4
## - under.five.deaths 1 1955.5 40329 6699.2
## - Schooling 1 3052.4 41425 6762.1
## - HIV.AIDS 1 8273.2 46646 7040.3
## - Adult.Mortality 1 9114.8 47488 7082.2
#summary for each model
up = summary(stepup)
up
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths +
## Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.0294 -2.1028 0.0757 2.3710 16.0441
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.188e+01 6.977e-01 74.359 < 2e-16 ***
## Adult.Mortality -2.091e-02 8.887e-04 -23.525 < 2e-16 ***
## infant.deaths 9.970e-02 9.251e-03 10.777 < 2e-16 ***
## Alcohol 1.017e-01 2.737e-02 3.715 0.000208 ***
## percentage.expenditure 1.944e-04 5.455e-05 3.563 0.000374 ***
## Measles -1.934e-05 8.491e-06 -2.277 0.022848 *
## BMI 4.111e-02 5.613e-03 7.324 3.31e-13 ***
## under.five.deaths -7.429e-02 6.831e-03 -10.876 < 2e-16 ***
## Polio 2.327e-02 4.977e-03 4.674 3.12e-06 ***
## Total.expenditure 9.986e-02 3.776e-02 2.644 0.008237 **
## Diphtheria 3.639e-02 5.012e-03 7.260 5.25e-13 ***
## HIV.AIDS -4.473e-01 1.996e-02 -22.414 < 2e-16 ***
## thinness..1.19.years -1.103e-01 5.568e-02 -1.980 0.047825 *
## thinness.5.9.years 4.121e-02 5.495e-02 0.750 0.453338
## Income.composition.of.resources 5.677e+00 7.297e-01 7.780 1.08e-14 ***
## Schooling 6.622e-01 4.874e-02 13.586 < 2e-16 ***
## logGDP 3.792e-01 6.968e-02 5.443 5.80e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.06 on 2327 degrees of freedom
## Multiple R-squared: 0.8186, Adjusted R-squared: 0.8174
## F-statistic: 656.5 on 16 and 2327 DF, p-value: < 2.2e-16
down = summary(stepdown)
down
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths +
## Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## Income.composition.of.resources + Schooling + logGDP, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.0079 -2.1041 0.0671 2.3681 16.0263
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.193e+01 6.942e-01 74.809 < 2e-16 ***
## Adult.Mortality -2.089e-02 8.883e-04 -23.515 < 2e-16 ***
## infant.deaths 9.992e-02 9.246e-03 10.807 < 2e-16 ***
## Alcohol 1.012e-01 2.736e-02 3.700 0.000221 ***
## percentage.expenditure 1.944e-04 5.454e-05 3.564 0.000373 ***
## Measles -1.963e-05 8.482e-06 -2.314 0.020753 *
## BMI 4.061e-02 5.573e-03 7.287 4.32e-13 ***
## under.five.deaths -7.439e-02 6.830e-03 -10.892 < 2e-16 ***
## Polio 2.326e-02 4.977e-03 4.674 3.12e-06 ***
## Total.expenditure 9.821e-02 3.769e-02 2.606 0.009231 **
## Diphtheria 3.647e-02 5.010e-03 7.279 4.57e-13 ***
## HIV.AIDS -4.469e-01 1.995e-02 -22.403 < 2e-16 ***
## thinness..1.19.years -7.361e-02 2.672e-02 -2.755 0.005920 **
## Income.composition.of.resources 5.679e+00 7.296e-01 7.783 1.05e-14 ***
## Schooling 6.630e-01 4.872e-02 13.608 < 2e-16 ***
## logGDP 3.765e-01 6.957e-02 5.411 6.91e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.06 on 2328 degrees of freedom
## Multiple R-squared: 0.8186, Adjusted R-squared: 0.8174
## F-statistic: 700.3 on 15 and 2328 DF, p-value: < 2.2e-16
both = summary(stepboth)
both
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths +
## Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## Income.composition.of.resources + Schooling + logGDP, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.0079 -2.1041 0.0671 2.3681 16.0263
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.193e+01 6.942e-01 74.809 < 2e-16 ***
## Adult.Mortality -2.089e-02 8.883e-04 -23.515 < 2e-16 ***
## infant.deaths 9.992e-02 9.246e-03 10.807 < 2e-16 ***
## Alcohol 1.012e-01 2.736e-02 3.700 0.000221 ***
## percentage.expenditure 1.944e-04 5.454e-05 3.564 0.000373 ***
## Measles -1.963e-05 8.482e-06 -2.314 0.020753 *
## BMI 4.061e-02 5.573e-03 7.287 4.32e-13 ***
## under.five.deaths -7.439e-02 6.830e-03 -10.892 < 2e-16 ***
## Polio 2.326e-02 4.977e-03 4.674 3.12e-06 ***
## Total.expenditure 9.821e-02 3.769e-02 2.606 0.009231 **
## Diphtheria 3.647e-02 5.010e-03 7.279 4.57e-13 ***
## HIV.AIDS -4.469e-01 1.995e-02 -22.403 < 2e-16 ***
## thinness..1.19.years -7.361e-02 2.672e-02 -2.755 0.005920 **
## Income.composition.of.resources 5.679e+00 7.296e-01 7.783 1.05e-14 ***
## Schooling 6.630e-01 4.872e-02 13.608 < 2e-16 ***
## logGDP 3.765e-01 6.957e-02 5.411 6.91e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.06 on 2328 degrees of freedom
## Multiple R-squared: 0.8186, Adjusted R-squared: 0.8174
## F-statistic: 700.3 on 15 and 2328 DF, p-value: < 2.2e-16
fitFull
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths +
## Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP, data = training)
##
## Coefficients:
## (Intercept) Adult.Mortality
## 5.188e+01 -2.091e-02
## infant.deaths Alcohol
## 9.970e-02 1.017e-01
## percentage.expenditure Measles
## 1.944e-04 -1.934e-05
## BMI under.five.deaths
## 4.111e-02 -7.429e-02
## Polio Total.expenditure
## 2.327e-02 9.986e-02
## Diphtheria HIV.AIDS
## 3.639e-02 -4.473e-01
## thinness..1.19.years thinness.5.9.years
## -1.103e-01 4.121e-02
## Income.composition.of.resources Schooling
## 5.677e+00 6.622e-01
## logGDP
## 3.792e-01
olsrr::ols_step_forward_aic(fitFull)
##
## Selection Summary
## ------------------------------------------------------------------------------------------------
## Variable AIC Sum Sq RSS R-Sq Adj. R-Sq
## ------------------------------------------------------------------------------------------------
## Schooling 15530.008 108310.949 103218.899 0.51204 0.51183
## Adult.Mortality 14375.533 148508.373 63021.474 0.70207 0.70181
## HIV.AIDS 13971.630 158528.968 53000.879 0.74944 0.74912
## Diphtheria 13721.013 163943.928 47585.920 0.77504 0.77465
## BMI 13591.942 166531.793 44998.054 0.78727 0.78682
## Income.composition.of.resources 13493.362 168421.796 43108.051 0.79621 0.79568
## logGDP 13432.779 169557.522 41972.325 0.80158 0.80098
## Polio 13408.221 170030.391 41499.456 0.80381 0.80314
## thinness..1.19.years 13387.220 170435.632 41094.216 0.80573 0.80498
## Measles 13372.433 170728.879 40800.969 0.80711 0.80629
## percentage.expenditure 13360.849 170964.653 40565.194 0.80823 0.80732
## Total.expenditure 13353.576 171124.817 40405.030 0.80899 0.80800
## Alcohol 13351.222 171199.787 40330.061 0.80934 0.80828
## ------------------------------------------------------------------------------------------------
Adding Complexity to the Model
# Transformations using Polio as a test
transformations <- training
# Original
training %>% ggplot(aes(x=Polio, y = Life.expectancy)) +
geom_point() + geom_smooth () + ylab("Life Expectancy in Age")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Transforming with Polynomial using a ^2
transformations$polio2 <- transformations$Polio^2
transformations %>% ggplot(aes(x=polio2, y=Life.expectancy)) +
geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Transforming using Log
transformations$logPolio <- log(transformations$Polio)
transformations %>% ggplot(aes(x=logPolio, y=Life.expectancy)) +
geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Testing if it's the same as logging inside the plot
transformations %>% ggplot(aes(x=log(Polio), y=Life.expectancy)) +
geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Creating plot functions
# Original Data plot
plot_original <- function(var1) {
return(training %>% ggplot(aes(x=var1, y = Life.expectancy)) +
geom_point() + geom_smooth () + ylab("Life Expectancy in Age")
)
}
plot_original_interaction <- function(var1) {
return(training %>% ggplot(aes(x=var1,
y = Life.expectancy,
color=Status)) +
geom_point() + geom_smooth () + ylab("Life Expectancy in Age")
)
}
plot_original(training$Polio)+
xlab("Polio Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Creating function to plot variable as polynomials
plot_poly <- function(var1) {
return(training %>% ggplot(aes(x=var1^2,
y = Life.expectancy,
color=Status)) +
geom_point() + geom_smooth () + ylab("Life Expectancy in Age")
)
}
plot_poly(training$Polio)+
xlab("Polio Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Creating function to plot variables as logged
plot_log <- function(var1) {
return(training %>% ggplot(aes(x=log(var1),
y = Life.expectancy,
color=Status)) +
geom_point() + geom_smooth () + ylab("Life Expectancy in Age")
)
}
plot_log(training$Polio) +
xlab("Polio Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Putting each Variable into the Function
# Infant Deaths Variable
plot_original(training$infant.deaths) + xlab("Infant Deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$infant.deaths) + xlab("Infant Deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$infant.deaths) + xlab("Infant Deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$infant.deaths) + xlab("Infant Deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 656 rows containing non-finite values (`stat_smooth()`).
# Alcohol Variable
plot_original(training$Alcohol) +
xlab("Alcohol Consumption per capita in Litres of pure alcohol")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$Alcohol) +
xlab("Alcohol Consumption per capita in Litres of pure alcohol")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$Alcohol) +
xlab("Alcohol Consumption per capita in Litres of pure alcohol")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$Alcohol) +
xlab("Alcohol Consumption per capita in Litres of pure alcohol")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Percentage of Expenditures Variable
plot_original(training$percentage.expenditure) +
xlab("Expenditure on health as a percentage of Gross Domestic Product per capita")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$percentage.expenditure) +
xlab("Expenditure on health as a percentage of Gross Domestic Product per capita")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$percentage.expenditure) +
xlab("Expenditure on health as a percentage of Gross Domestic Product per capita")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$percentage.expenditure) +
xlab("Expenditure on health as a percentage of Gross Domestic Product per capita")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 500 rows containing non-finite values (`stat_smooth()`).
# Measles Variable
plot_original(training$Measles) + xlab("Reported Measles cases per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$Measles) + xlab("Reported Measles cases per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$Measles)+ xlab("Reported Measles cases per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$Measles)+ xlab("Reported Measles cases per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 778 rows containing non-finite values (`stat_smooth()`).
# BMI Variable
plot_original(training$BMI) + xlab("Average Body Mass Index of population")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$BMI) + xlab("Average Body Mass Index of population")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$BMI)+ xlab("Average Body Mass Index of population")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$BMI)+ xlab("Average Body Mass Index of population")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Under Five Deaths Variable
plot_original(training$under.five.deaths) +
xlab("Under age 5 deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$under.five.deaths) +
xlab("Under age 5 deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$under.five.deaths)+
xlab("Under age 5 deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$under.five.deaths)+
xlab("Under age 5 deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 604 rows containing non-finite values (`stat_smooth()`).
# Total Expenditures Variable
plot_original(training$Total.expenditure) +
xlab("Percentage of total government expenditures on health")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$Total.expenditure) +
xlab("Percentage of total government expenditures on health")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$Total.expenditure)+
xlab("Percentage of total government expenditures on health")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$Total.expenditure)+
xlab("Percentage of total government expenditures on health")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Diphtheria Variable
plot_original(training$Diphtheria) +
xlab("DPT3 Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$Diphtheria) +
xlab("DPT3 Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$Diphtheria)+
xlab("DPT3 Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$Diphtheria)+
xlab("DPT3 Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# HIV AIDS Variable
plot_original(training$HIV.AIDS) + xlab ("HIV/AIDS Deaths per 1000 live births")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$HIV.AIDS) + xlab ("HIV/AIDS Deaths per 1000 live births")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$HIV.AIDS)+ xlab ("HIV/AIDS Deaths per 1000 live births")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$HIV.AIDS)+ xlab ("HIV/AIDS Deaths per 1000 live births")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# GDP Variable
plot_original(training$GDP) + xlab("Gross Domestic Product per capita in USD")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$GDP) + xlab("Gross Domestic Product per capita in USD")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$GDP)+ xlab("Gross Domestic Product per capita in USD")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$GDP)+ xlab("Gross Domestic Product per capita in USD")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Thinness 10-19 Years Variable
plot_original(training$thinness..1.19.years) +
xlab("Prevalence of thinness among children ages 10 to 19")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$thinness..1.19.years) +
xlab("Prevalence of thinness among children ages 10 to 19")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$thinness..1.19.years)+
xlab("Prevalence of thinness among children ages 10 to 19")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$thinness..1.19.years)+
xlab("Prevalence of thinness among children ages 10 to 19")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Thinness 5-9 Variable
plot_original(training$thinness.5.9.years)+
xlab("Prevalence of thinness among children ages 5 to 9")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$thinness.5.9.years)+
xlab("Prevalence of thinness among children ages 5 to 9")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$thinness.5.9.years)+
xlab("Prevalence of thinness among children ages 5 to 9")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$thinness.5.9.years)+
xlab("Prevalence of thinness among children ages 5 to 9")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Income Composition of Resources Variable
plot_original(training$Income.composition.of.resources) +
xlab("Human Development Index in terms of income composition of resources")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$Income.composition.of.resources) +
xlab("Human Development Index in terms of income composition of resources")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$Income.composition.of.resources)+
xlab("Human Development Index in terms of income composition of resources")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$Income.composition.of.resources)+
xlab("Human Development Index in terms of income composition of resources")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 101 rows containing non-finite values (`stat_smooth()`).
# Schooling Variable
plot_original(training$Schooling) + xlab("Years in School")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_original_interaction(training$Schooling) + xlab("Years in School")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_poly(training$Schooling)+ xlab("Years in School")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_log(training$Schooling)+ xlab("Years in School")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 21 rows containing non-finite values (`stat_smooth()`).
Original Model for comparison
original.model <- lm(Life.expectancy~ Polio + Alcohol + BMI +
Diphtheria+ Schooling + HIV.AIDS + GDP +
thinness..1.19.years + Measles + Total.expenditure +
Income.composition.of.resources, training)
plot(original.model)
Fitting a complex model with polynomials up to 2 & logged
variables
poly2logmodel <- lm(Life.expectancy~ poly(Polio,2)+
poly(Alcohol,2)+
poly(BMI,2)+
poly(Diphtheria,2)+
poly(Schooling,2)+
log(HIV.AIDS) +
log(GDP) +
log(thinness..1.19.years) +
Measles +
Total.expenditure+
Income.composition.of.resources, training)
plot(poly2logmodel)
Fitting a complex model with polynomials up to 7 & logged variables
poly7logmodel <- lm(Life.expectancy~ poly(Polio,7)+
sqrt(Alcohol)+
poly(BMI,7)+
poly(Diphtheria,5)+
poly(Schooling,7)+
log(HIV.AIDS) +
log(GDP) +
log(thinness..1.19.years) +
Measles +
Total.expenditure+
Income.composition.of.resources, training)
plot(poly7logmodel)
Fitting interaction terms on regular model
interaction.model <- lm(Life.expectancy~ Polio:Status+Alcohol:Status+BMI:Status+
Diphtheria:Status+Schooling:Status+
HIV.AIDS:Status+GDP:Status+
thinness..1.19.years:Status+
Measles:Status+Total.expenditure:Status+
Income.composition.of.resources:Status, training)
plot(interaction.model)
Fitting interaction on poly2logmodel
poly2log.interaction.model <- lm(Life.expectancy~ poly(Polio,2):Status+
poly(Alcohol,2):Status+
poly(BMI,2):Status+
poly(Diphtheria,2):Status+
poly(Schooling,2):Status+
log(HIV.AIDS):Status +
log(GDP):Status +
log(thinness..1.19.years):Status +
Measles:Status +
Total.expenditure:Status+
Income.composition.of.resources:Status, training)
plot(poly2log.interaction.model)
Fitting interaction on poly7logmodel
poly7log.interaction.model <- lm(Life.expectancy~ poly(Polio,7):Status+
sqrt(Alcohol):Status+
poly(BMI,7):Status+
poly(Diphtheria,5):Status+
poly(Schooling,7):Status+
log(HIV.AIDS):Status +
log(GDP):Status +
log(thinness..1.19.years):Status +
Measles:Status +
Total.expenditure:Status+
Income.composition.of.resources:Status, training)
plot(poly7log.interaction.model)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
Fitting a complex model with polynomials up to 7 & logged variables and categorical variable Status
status.poly7logmodel <- lm(Life.expectancy~ Status + poly(Polio,7)+
sqrt(Alcohol)+
poly(BMI,7)+
poly(Diphtheria,5)+
poly(Schooling,7)+
log(HIV.AIDS) +
log(GDP) +
log(thinness..1.19.years) +
Measles +
Total.expenditure+
Income.composition.of.resources, training)
plot(status.poly7logmodel)
Simple Model
simple <- lm(Life.expectancy ~ HIV.AIDS + Alcohol:Status + Total.expenditure, training)
plot(simple)
Predictions
# RMSE of Full Model
full.model_Preds = predict(full.model, newdata = validate)
MSPE = data.frame(Observed = validate$Life.expectancy,
Predicted = full.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.485943
# RMSE of Penalized Regression Model
glmnet.fit.model_Preds = predict(glmnet.fit.model, newdata = validate)
MSPE = data.frame(Observed = validate$Life.expectancy,
Predicted = glmnet.fit.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.40931
# RMSE of Forward Selection Model
fwd.select.model_Preds = predict(fwd.select.model, newdata = validate)
MSPE = data.frame(Observed = validate$Life.expectancy,
Predicted = fwd.select.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.528587
# RMSE of Backward Selection Model
bck.select.model_Preds = predict(bck.select.model, newdata = validate)
MSPE = data.frame(Observed = validate$Life.expectancy,
Predicted = bck.select.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.412429
# RMSE of Stepwise Selection Model
sw.select.model_Preds = predict(sw.select.model, newdata = validate)
MSPE = data.frame(Observed = validate$Life.expectancy,
Predicted = sw.select.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.521186
# RMSE of Original Model
original.model_Preds = predict(original.model, newdata = validate)
MSPE = data.frame(Observed = validate$Life.expectancy,
Predicted = original.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.548685
# RMSE of Poly2Log Model
poly2logmodel_Preds = predict(poly2logmodel, newdata = validate)
MSPE = data.frame(Observed = validate$Life.expectancy,
Predicted = poly2logmodel_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 3.80083
# RMSE of Poly7Log Model
poly7logmodel_Preds = predict(poly7logmodel, newdata = validate)
MSPE = data.frame(Observed = validate$Life.expectancy,
Predicted = poly7logmodel_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 3.734417
# RMSE of Original with Interaction Model
interaction.model_Preds = predict(interaction.model, newdata = validate)
MSPE = data.frame(Observed = validate$Life.expectancy,
Predicted = interaction.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.369254
# RMSE of Poly2Log Model with interactions
poly2log.interaction.model_Preds = predict(poly2log.interaction.model, newdata = validate)
MSPE = data.frame(Observed = validate$Life.expectancy,
Predicted = poly2log.interaction.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 3.671729
# RMSE of Poly7Log Model with interactions
poly7log.interaction.model_Preds = predict(poly7log.interaction.model, newdata = validate)
MSPE = data.frame(Observed = validate$Life.expectancy,
Predicted = poly7log.interaction.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 3.533711
# RMSE of Poly7Log Model with Categorical variable Status added (not as interaction)
status.poly7logmodel_Preds = predict(status.poly7logmodel, newdata = validate)
MSPE = data.frame(Observed = validate$Life.expectancy,
Predicted = status.poly7logmodel_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 3.702815
#knn with training
library(caret)
fit_cont1 = trainControl(method = "repeatedcv", number = 10, repeats = 1)
set.seed(1364)
knnfit1 = train(Life.expectancy~Adult.Mortality + infant.deaths + Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths + Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + thinness.5.9.years + Income.composition.of.resources + Schooling + logGDP, data =training, method = "knn", trControl = fit_cont1, tuneGrid = expand.grid(k = c(1:30)))
plot(knnfit1)
updateval = validate[,c("Life.expectancy", "Adult.Mortality", "infant.deaths", "Alcohol", "percentage.expenditure", "Measles", "BMI", "under.five.deaths", "Polio", "Total.expenditure", "Diphtheria", "HIV.AIDS", "thinness..1.19.years","thinness.5.9.years", "Income.composition.of.resources", "Schooling", "logGDP")]
prediction = predict(knnfit1, newdata = updateval)
MSPE = data.frame(Observed = validate$Life.expectancy, Predicted = prediction)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.840354
#prediction = predict(knnfit1, newdata = validate)
#cf = confusionMatrix(prediction, updateval$Life.expectancy)
#cf
#print(cf)